import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
import plotly.graph_objects as pl
# Getting the current working directory (informational; displays in a notebook).
os.getcwd()
# Changing the working directory so the dataset below can be loaded by bare filename.
# NOTE(review): machine-specific absolute path - this will fail on any other machine.
os.chdir('/Users/aishwaryamaddimsetty/Downloads')
# Importing the dataset and assigning it to `data` (every later step uses it).
data = pd.read_csv('DSA_DataSet.csv')
# First 5 rows - just a quick look at the data.
data.head(5)
# Number of rows and columns of the dataset.
data.shape
# Column details: names, non-null counts and dtypes.
data.info()
# Datatypes of the columns.
data.dtypes
# Basic summary statistics (numeric columns by default).
data.describe()
# Null count per column (data.info() above already showed all columns non-null).
data.isnull().sum()
# Counts of each category present in the data's categorical (object-dtype) columns.
category_column = [i for i in data.columns if data[i].dtypes == 'object']
for column in category_column:
    # (Loop body indentation restored - it was flush-left in the notebook
    # export, which is a syntax error in plain Python.)
    print(column, '\n\n')
    print(data[column].value_counts())
    print("---" *20)
# 'y' is the target variable (subscription outcome); count each class.
target_count = data['y'].value_counts()
target_count
# Analysing the target column visually - share of Yes vs No as a pie chart.
colors = ['Red', 'Green']
trace = pl.Pie(labels=target_count.index, values=target_count.values,
               pull=[0.05], marker=dict(colors=colors))
layout = pl.Layout(title="Subscribed to the Term Deposit", height=200,
                   legend=dict(x=1.1, y=1.3))
fig = pl.Figure(data=[trace], layout=layout)
# Layout height above is overridden here with the final figure size.
fig.update_layout(height=500, width=600)
fig.show()
# Categorical feature EDA: JOB, MARITAL, DEFAULT, EDUCATION, HOUSING, LOAN,
# CONTACT, MONTH, DAY_OF_WEEK, POUTCOME.
# The original notebook repeated the same three plots verbatim for each of
# these ten columns; the loop below runs the identical sequence once per column.
# NOTE: the original called sns.countplot(data[col]) with a positional Series,
# a form removed in seaborn >= 0.12; the keyword form is used instead.
sns.set_style('whitegrid')
for col in ['job', 'marital', 'default', 'education', 'housing',
            'loan', 'contact', 'month', 'day_of_week', 'poutcome']:
    # Overall distribution of the category.
    plt.figure(figsize=(14, 7))
    sns.countplot(x=col, data=data)
    # Same category split by the subscription target 'y'.
    plt.figure(figsize=(10, 5))
    ax = sns.countplot(x=col, hue='y', data=data, palette='Set1')
    # Slant the tick labels so long category names stay readable.
    # (The original set size=15 for 'job' only; a uniform default is used here.)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=40, ha="right")
    plt.tight_layout()
    plt.show()
# AGE: spread by subscription outcome, then its overall distribution.
sns.boxplot(data=data, x='y', y='age', hue="y")
plt.tight_layout()
plt.figure(figsize=(10, 8))
# histplot replaces distplot (deprecated and removed in seaborn 0.14);
# kde=True + stat='density' reproduces the old distplot default output.
sns.histplot(data["age"], kde=True, stat='density')
# %matplotlib inline  -- IPython magic from the notebook export; invalid in
# plain Python, kept only as a comment.
# DURATION: call duration by subscription outcome, then its distribution.
sns.boxplot(data=data, x="y", y="duration")
plt.show()
plt.figure(figsize=(10, 8))
sns.histplot(data["duration"], kde=True, stat='density')
plt.show()
# CAMPAIGN: number of contacts performed during this campaign.
plt.figure(figsize=(12, 10))
# Top panel: the common range (up to 10 campaigns).
plt.subplot(2, 1, 1)
sns.countplot(x='campaign', hue='y', data=data, palette='Set1')
plt.xlim(right=10)
plt.xlabel('')
# Bottom panel: the long tail (11+ campaigns), zoomed in on low counts.
plt.subplot(2, 1, 2)
sns.countplot(x='campaign', hue='y', data=data, palette='Set1')
plt.xlim(left=11)
plt.ylim(top=30)
plt.xlabel('No of Campaigns', fontsize=14)
plt.show()
sns.boxplot(data=data, x="y", y="campaign")
plt.show()
# (%matplotlib inline magic removed - invalid in plain Python.)
plt.figure(figsize=(10, 8))
# histplot replaces the deprecated distplot (removed in seaborn 0.14).
sns.histplot(data["campaign"], kde=True, stat='density')
plt.show()
# PDAYS: days since the client was last contacted in a previous campaign.
data['pdays'].unique()
data['pdays'].value_counts()
# (%matplotlib inline magic removed - invalid in plain Python.)
sns.boxplot(data=data, x="y", y="pdays")
plt.show()
plt.figure(figsize=(10, 8))
# Overlay the pdays distribution for subscribers vs non-subscribers.
# histplot replaces the deprecated distplot (removed in seaborn 0.14).
sns.histplot(data[data["y"]=="yes"]["pdays"], kde=True, stat='density')
sns.histplot(data[data["y"]=="no"]["pdays"], kde=True, stat='density')
plt.show()
# PREVIOUS: number of contacts performed before this campaign.
data["previous"].unique()
data["previous"].value_counts()
# Class-wise breakdown of 'previous'.
data[data["y"]=="yes"]["previous"].value_counts()
data[data["y"]=="no"]["previous"].value_counts()
# (%matplotlib inline magics removed - invalid in plain Python.)
sns.boxplot(data=data, x="y", y="previous")
plt.show()
plt.figure(figsize=(10, 8))
# histplot replaces the deprecated distplot (removed in seaborn 0.14).
sns.histplot(data["previous"], kde=True, stat='density')
plt.show()
plt.figure(figsize=(10, 8))
sns.histplot(data[data["y"]=="yes"]["previous"], kde=True, stat='density')
sns.histplot(data[data["y"]=="no"]["previous"], kde=True, stat='density')
plt.show()
# Count plots for 'previous'. (The original comment said "Marital" here -
# a copy/paste slip.)
sns.set_style('whitegrid')
plt.figure(figsize=(14, 7))
# Keyword form: positional-Series countplot is removed in seaborn >= 0.12.
sns.countplot(x='previous', data=data)
sns.countplot(x='previous', hue='y', data=data, palette='Set1')
plt.show()
# Economic-context indicators: for each one, a boxplot split by the target
# followed by its overall distribution. The original repeated this plot pair
# verbatim per column (only 'emp.var.rate' had a value_counts first).
data["emp.var.rate"].value_counts()
# (%matplotlib inline magics removed - invalid in plain Python.)
for col in ["emp.var.rate", "cons.price.idx", "cons.conf.idx",
            "euribor3m", "nr.employed"]:
    sns.boxplot(data=data, x="y", y=col)
    plt.show()
    plt.figure(figsize=(10, 8))
    # histplot replaces the deprecated distplot (removed in seaborn 0.14).
    sns.histplot(data[col], kde=True, stat='density')
    plt.show()
# Cleaning the data and making it ready for modeling.
# Importing the libraries required for machine learning and preprocessing the data:
import pandas as pd
import pickle
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
# Dropping the pre-computed model prediction column so it cannot leak into training.
data = data.drop(columns = ['ModelPrediction'])
# Handling duplicate data: select the rows flagged as duplicates, keeping the
# last occurrence as the "original" (so earlier copies are the ones flagged).
data_dups = data[data.duplicated(keep = "last")]
data_dups
# Count of duplicated rows (bare expressions display in a notebook).
data_dups.shape
#### Looks like we have 12 duplicated rows, so dropping the duplicate rows is advised.
data = data.drop_duplicates()
data.shape
# Features = every column except the last (the target 'y'); target = 'y'.
data_x = data.iloc[:, :-1]
print("Shape of X: ", data_x.shape)
data_y = data['y']
print('Shape of Y: ', data_y.shape)
# 80/20 split into rest/test, then 80/20 of the rest into train/CV.
# random_state makes the split reproducible run-to-run, and stratify keeps
# the yes/no class proportions equal in every split (helpful if the target
# is imbalanced, as the value_counts above suggests).
X_rest, X_test, y_rest, y_test = train_test_split(
    data_x, data_y, test_size=0.2, random_state=42, stratify=data_y)
X_train, X_cv, y_train, y_cv = train_test_split(
    X_rest, y_rest, test_size=0.2, random_state=42, stratify=y_rest)
print("X Train:", X_train.shape)
print("X CV:", X_cv.shape)
print("X Test:", X_test.shape)
print("Y Train:", y_train.shape)
print("Y CV:", y_cv.shape)
print("Y Test:", y_test.shape)
# Label encoding - map 'no' to 0 and 'yes' to 1. Plain reassignment instead
# of inplace=True avoids pandas' inplace-replace deprecation warnings.
y_train = y_train.replace({'no': 0, 'yes': 1})
y_cv = y_cv.replace({'no': 0, 'yes': 1})
y_test = y_test.replace({'no': 0, 'yes': 1})
# Before training, categorical variables must be converted to numbers: most
# machine-learning algorithms cannot operate on label data directly and need
# all inputs to be numeric. One-hot encoding is used here because these
# categories carry no natural ordering.
# Boolean mask over the feature columns: True where the dtype is object.
categorical_condn = data_x.dtypes == object
categorical_condn
# Names of the categorical columns, collected into a plain list.
categorical_cols = [name for name in data_x.columns if data_x[name].dtype == object]
categorical_cols
from sklearn.feature_extraction.text import CountVectorizer
def add_onehot_to_dataframe(sparse, df, vectorizer, name):
    """Append one-hot columns from `sparse` to `df` in place and return `df`.

    sparse     : scipy sparse matrix, one column per vocabulary feature.
    df         : DataFrame mutated in place (one new column per feature).
    vectorizer : fitted vectorizer supplying the feature names.
    name       : prefix for the new column names ("<name>_<feature>").
    """
    # (Body indentation restored - it was flush-left in the notebook export,
    # which is a syntax error in plain Python.)
    try:
        # sklearn >= 1.0; get_feature_names() was removed in sklearn 1.2.
        feature_names = vectorizer.get_feature_names_out()
    except AttributeError:
        feature_names = vectorizer.get_feature_names()
    for i, col in enumerate(feature_names):
        colname = name + "_" + col
        # Densify one sparse column at a time to keep peak memory low.
        df[colname] = sparse[:, i].toarray().ravel().tolist()
    return df
def OneHotEncoder(categorical_cols, X_train, X_test, X_cv = None, include_cv = False):
    """One-hot encode each categorical column of the given frames.

    A CountVectorizer is fitted on the training column only and then used to
    transform test (and optionally CV), so categories unseen in training
    cannot introduce new columns. The frames are mutated in place by
    add_onehot_to_dataframe; the function returns None.

    NOTE(review): this name shadows sklearn.preprocessing.OneHotEncoder -
    kept as-is because the call below depends on it.
    """
    # (Body indentation restored - it was flush-left in the notebook export.)
    for i in categorical_cols:
        Vectorizer = CountVectorizer(token_pattern="[A-Za-z0-9-.]+")
        print("Encoding for feature: ", i)
        # Fit on train; the same vocabulary is reused for CV/test below.
        temp_cols = Vectorizer.fit_transform(X_train[i])
        X_train = add_onehot_to_dataframe(temp_cols, X_train, Vectorizer, i)
        if include_cv:
            temp_cols = Vectorizer.transform(X_cv[i])
            X_cv = add_onehot_to_dataframe(temp_cols, X_cv, Vectorizer, i)
        temp_cols = Vectorizer.transform(X_test[i])
        X_test = add_onehot_to_dataframe(temp_cols, X_test, Vectorizer, i)
# Encode train, test and CV in place.
OneHotEncoder(categorical_cols, X_train, X_test, X_cv, True)
# Dropping the original categorical columns now that their one-hot
# equivalents have been appended by OneHotEncoder above.
X_train = X_train.drop(categorical_cols, axis = 1)
X_cv = X_cv.drop(categorical_cols, axis = 1)
X_test = X_test.drop(categorical_cols, axis = 1)
print("Shape of train:", X_train.shape)
print("Shape of CV:", X_cv.shape)
print("Shape of test:", X_test.shape)
X_train.info()
# Displaying the train set to confirm the whole dataset is numeric now.
X_train.head(5)
# Show every column instead of pandas' abbreviated column view.
pd.set_option('display.max_columns', None)
X_train.head(5)
# Saving a copy to disk for ad-hoc filtering in a spreadsheet if needed.
# NOTE(review): despite the original comment, data_x is the PRE-encoding
# feature frame - the encoding above was applied to X_train/X_cv/X_test,
# not to data_x, so the file name "encoded_data_x.csv" is misleading.
data_x.to_csv("encoded_data_x.csv")
# Advice before building any model on this dataset: drop the 'duration'
# column because of its high correlation with the target variable.
# This first baseline is deliberately built WITH the 'duration' column.
# Logistic regression is chosen because this is a binary classification
# task - the textbook first model. class_weight='balanced' reweights the
# classes inversely to their frequencies.
model = LogisticRegression(class_weight = 'balanced')
model.fit(X_train, y_train)
# Class-membership probabilities for the test set; column 1 is P(y=1).
y_pred = model.predict_proba(X_test)
print("AUC score:", roc_auc_score(y_test, y_pred[:,1]))
print(y_pred)
# Probabilities kept as a list (presumably for later inspection - no use in view).
Y_ProbList =list(y_pred)
# Mean accuracy on the training data.
model.score(X_train,y_train)
data.shape
# 'ModelPrediction' was already dropped from `data` earlier in this script,
# so dropping it again unconditionally raised KeyError. errors='ignore'
# makes this cell safe regardless of which cells ran before it.
data_new = data.drop(columns=['ModelPrediction'], errors='ignore')
data_new.shape
# Binary-encode 'contact' directly: cellular -> 0, telephone -> 1.
contact = ({'cellular': 0, 'telephone': 1})
data_new['contact'] = data_new['contact'].map(contact)
# One-hot encode the remaining categorical columns; drop_first avoids the
# redundant (perfectly collinear) first level of each.
data_new = pd.get_dummies(data_new, columns=['job', 'marital', 'education', 'default',
                                             'housing', 'loan', 'month',
                                             'day_of_week', 'poutcome'], drop_first=True)
data_new.shape
# Importing required scikit-learn libraries for the second, simpler pipeline.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# %matplotlib inline  -- IPython magic from the notebook export; invalid in
# plain Python, kept only as a comment.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
print('.....Setup complete')
import os
# Group data into X (all feature columns) and y (the label).
X = data_new.loc[:, data_new.columns != 'y']
# Select 'y' as a Series rather than a one-column DataFrame so downstream
# estimators receive the 1-D label vector they expect (the original
# .loc[:, columns == 'y'] form produced a DataFrame).
y = data_new['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Print length of both the train and test sets.
print('shape of X_train : ', len(X_train), '\nshape of y_train : ', len(y_train))
print('\nshape of X_test : ', len(X_test), '\nshape of y_test : ', len(y_test))
# Normalize the data: fit the scaler on the training set only, then apply
# the same transform to test so test statistics never leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Train the model using logistic regression.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
# np.ravel flattens the labels to 1-D, which silences sklearn's
# DataConversionWarning when y_train is a one-column DataFrame and is a
# no-op when it is already a Series.
clf.fit(X_train, np.ravel(y_train))
# Mean accuracy on train and test (bare expressions display in a notebook).
clf.score(X_train, y_train)
clf.score(X_test, y_test)
# Predict X_test and save in variable y_pred.
y_pred = clf.predict(X_test)
# Evaluate the model.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))